{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d36f3cd2-090c-4252-8eb6-cb11dd9c6611",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import pandas as pd\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain_community.graphs import Neo4jGraph\n",
    "\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-\"\n",
    "\n",
    "llm = ChatOpenAI(model_name=\"gpt-4-turbo\", request_timeout=180)\n",
    "DEMO_URL = \"neo4j+s://demo.neo4jlabs.com\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f15332f7-e82d-49aa-9571-2b31e5ad0a04",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>custom_id</th>\n",
       "      <th>response</th>\n",
       "      <th>error</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>batch_req_O0iYejkKUIm6PySDuP16SxOC</td>\n",
       "      <td>request-0</td>\n",
       "      <td>{'status_code': 200, 'request_id': 'f6cdf9300c...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>batch_req_Vi2n7Z6o9c1luzKoV1FTE1Bd</td>\n",
       "      <td>request-1</td>\n",
       "      <td>{'status_code': 200, 'request_id': 'c9ded535e5...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>batch_req_WLQwdwd23ZecEiCM4ZCxw6mk</td>\n",
       "      <td>request-2</td>\n",
       "      <td>{'status_code': 200, 'request_id': 'ff25238d19...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>batch_req_q6lB2BIxq1VcSCKxzaAevu4W</td>\n",
       "      <td>request-3</td>\n",
       "      <td>{'status_code': 200, 'request_id': 'eab91ac91f...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>batch_req_YKy7DWW8b7gCi4ZySgeZIwNJ</td>\n",
       "      <td>request-4</td>\n",
       "      <td>{'status_code': 200, 'request_id': '2f4e70c3f5...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9819</th>\n",
       "      <td>batch_req_63Ofjq5l6Lecs3XLLgn1EjGV</td>\n",
       "      <td>request-9819</td>\n",
       "      <td>{'status_code': 200, 'request_id': '017c608a8d...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9820</th>\n",
       "      <td>batch_req_bCG36LwfGPmY6ggtZN9zSqxf</td>\n",
       "      <td>request-9820</td>\n",
       "      <td>{'status_code': 200, 'request_id': '38723b5f3f...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9821</th>\n",
       "      <td>batch_req_Y8OJIPS2s6IBENAtSWZYsdbT</td>\n",
       "      <td>request-9821</td>\n",
       "      <td>{'status_code': 200, 'request_id': '67b17a4454...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9822</th>\n",
       "      <td>batch_req_b4gtfaiDPA2ABb45hQq88YB4</td>\n",
       "      <td>request-9822</td>\n",
       "      <td>{'status_code': 200, 'request_id': '9f10be8911...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9823</th>\n",
       "      <td>batch_req_TzEGYm0Ym5NWfzYKjQFtfK9r</td>\n",
       "      <td>request-9823</td>\n",
       "      <td>{'status_code': 200, 'request_id': '087a304247...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>9824 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      id     custom_id  \\\n",
       "0     batch_req_O0iYejkKUIm6PySDuP16SxOC     request-0   \n",
       "1     batch_req_Vi2n7Z6o9c1luzKoV1FTE1Bd     request-1   \n",
       "2     batch_req_WLQwdwd23ZecEiCM4ZCxw6mk     request-2   \n",
       "3     batch_req_q6lB2BIxq1VcSCKxzaAevu4W     request-3   \n",
       "4     batch_req_YKy7DWW8b7gCi4ZySgeZIwNJ     request-4   \n",
       "...                                  ...           ...   \n",
       "9819  batch_req_63Ofjq5l6Lecs3XLLgn1EjGV  request-9819   \n",
       "9820  batch_req_bCG36LwfGPmY6ggtZN9zSqxf  request-9820   \n",
       "9821  batch_req_Y8OJIPS2s6IBENAtSWZYsdbT  request-9821   \n",
       "9822  batch_req_b4gtfaiDPA2ABb45hQq88YB4  request-9822   \n",
       "9823  batch_req_TzEGYm0Ym5NWfzYKjQFtfK9r  request-9823   \n",
       "\n",
       "                                               response  error  \n",
       "0     {'status_code': 200, 'request_id': 'f6cdf9300c...    NaN  \n",
       "1     {'status_code': 200, 'request_id': 'c9ded535e5...    NaN  \n",
       "2     {'status_code': 200, 'request_id': 'ff25238d19...    NaN  \n",
       "3     {'status_code': 200, 'request_id': 'eab91ac91f...    NaN  \n",
       "4     {'status_code': 200, 'request_id': '2f4e70c3f5...    NaN  \n",
       "...                                                 ...    ...  \n",
       "9819  {'status_code': 200, 'request_id': '017c608a8d...    NaN  \n",
       "9820  {'status_code': 200, 'request_id': '38723b5f3f...    NaN  \n",
       "9821  {'status_code': 200, 'request_id': '67b17a4454...    NaN  \n",
       "9822  {'status_code': 200, 'request_id': '9f10be8911...    NaN  \n",
       "9823  {'status_code': 200, 'request_id': '087a304247...    NaN  \n",
       "\n",
       "[9824 rows x 4 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the JSONL file into a DataFrame\n",
    "df = pd.read_json(\"text2cypher_batch_output.jsonl\", lines=True)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "6bb1edea-61e2-4ca4-829a-ede5322d9a0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_text(text):\n",
    "    # Adjust pattern to capture after ```cypher and spans multiple lines until ```\n",
    "    pattern = r\"```cypher\\n(.*?)\\n```\"\n",
    "    match = re.search(pattern, text, re.DOTALL)\n",
    "\n",
    "    if match:\n",
    "        # Return the extracted text if triple backticks are present\n",
    "        return match.group(1).strip()\n",
    "    else:\n",
    "        # Return the original text if triple backticks are not present\n",
    "        return text\n",
    "\n",
    "\n",
    "df[\"cypher\"] = [\n",
    "    extract_text(el[\"body\"][\"choices\"][0][\"message\"][\"content\"])\n",
    "    for el in df[\"response\"]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "c56e77d7-3885-4099-80fb-ee4cdd0b614b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>type</th>\n",
       "      <th>database</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>What are the top 5 movies with a runtime great...</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "      <td>request-0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>List the first 3 directors born before 1950.</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "      <td>request-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Which 5 users have rated more than 20 movies?</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "      <td>request-2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Identify the top 5 actors who have acted in mo...</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "      <td>request-3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>What are the top 3 genres associated with movi...</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "      <td>request-4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            question  \\\n",
       "0  What are the top 5 movies with a runtime great...   \n",
       "1       List the first 3 directors born before 1950.   \n",
       "2      Which 5 users have rated more than 20 movies?   \n",
       "3  Identify the top 5 actors who have acted in mo...   \n",
       "4  What are the top 3 genres associated with movi...   \n",
       "\n",
       "                       type         database         id  \n",
       "0  Simple Retrieval Queries  recommendations  request-0  \n",
       "1  Simple Retrieval Queries  recommendations  request-1  \n",
       "2  Simple Retrieval Queries  recommendations  request-2  \n",
       "3  Simple Retrieval Queries  recommendations  request-3  \n",
       "4  Simple Retrieval Queries  recommendations  request-4  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_df = pd.read_csv(\"text2cypher_questions.csv\")\n",
    "meta_df[\"id\"] = [f\"request-{el}\" for el, _ in enumerate(meta_df[\"database\"])]\n",
    "meta_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "a07acca1-f7df-4864-9d75-e46e7deeff38",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id_x</th>\n",
       "      <th>custom_id</th>\n",
       "      <th>response</th>\n",
       "      <th>error</th>\n",
       "      <th>cypher</th>\n",
       "      <th>question</th>\n",
       "      <th>type</th>\n",
       "      <th>database</th>\n",
       "      <th>id_y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>483</th>\n",
       "      <td>batch_req_mH0DRhUiLp0OpOrSINn1tDXW</td>\n",
       "      <td>request-483</td>\n",
       "      <td>{'status_code': 200, 'request_id': 'b5547bfdd8...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MATCH (u:User)-[r:INTERACTED]-&gt;(:User)\\nWHERE ...</td>\n",
       "      <td>Who are the top 5 users with the most interact...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>request-483</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>487</th>\n",
       "      <td>batch_req_b8rCrVJYbwnq8hnZNjsPOrLb</td>\n",
       "      <td>request-487</td>\n",
       "      <td>{'status_code': 200, 'request_id': 'e11dbacb98...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MATCH (u:User)-[r:INTERACTED]-&gt;(:User)\\nWHERE ...</td>\n",
       "      <td>List the top 5 users having the most interacti...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>request-487</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>486</th>\n",
       "      <td>batch_req_uVxhemAnwHga6d1eWP9526nm</td>\n",
       "      <td>request-486</td>\n",
       "      <td>{'status_code': 200, 'request_id': 'f7bd6ed44c...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MATCH (u:User)\\nWHERE u.color = '#3CB371' AND ...</td>\n",
       "      <td>Which 3 users have the largest size and a colo...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>request-486</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>485</th>\n",
       "      <td>batch_req_6JRQTx9yL2j9R4vboWKhy7gY</td>\n",
       "      <td>request-485</td>\n",
       "      <td>{'status_code': 200, 'request_id': '72e8aaa86f...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MATCH (u:User)-[:INTERACTED]-&gt;(v:User {color: ...</td>\n",
       "      <td>Show the top 5 users who have interacted with ...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>request-485</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>484</th>\n",
       "      <td>batch_req_juKdpCRV7Itlm8u8j9tDmDIp</td>\n",
       "      <td>request-484</td>\n",
       "      <td>{'status_code': 200, 'request_id': '165dc179bd...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MATCH (u:User {label: 'sarahp.bsky.social'})-[...</td>\n",
       "      <td>Find the top 3 users interacted with by 'sarah...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>request-484</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   id_x    custom_id  \\\n",
       "483  batch_req_mH0DRhUiLp0OpOrSINn1tDXW  request-483   \n",
       "487  batch_req_b8rCrVJYbwnq8hnZNjsPOrLb  request-487   \n",
       "486  batch_req_uVxhemAnwHga6d1eWP9526nm  request-486   \n",
       "485  batch_req_6JRQTx9yL2j9R4vboWKhy7gY  request-485   \n",
       "484  batch_req_juKdpCRV7Itlm8u8j9tDmDIp  request-484   \n",
       "\n",
       "                                              response  error  \\\n",
       "483  {'status_code': 200, 'request_id': 'b5547bfdd8...    NaN   \n",
       "487  {'status_code': 200, 'request_id': 'e11dbacb98...    NaN   \n",
       "486  {'status_code': 200, 'request_id': 'f7bd6ed44c...    NaN   \n",
       "485  {'status_code': 200, 'request_id': '72e8aaa86f...    NaN   \n",
       "484  {'status_code': 200, 'request_id': '165dc179bd...    NaN   \n",
       "\n",
       "                                                cypher  \\\n",
       "483  MATCH (u:User)-[r:INTERACTED]->(:User)\\nWHERE ...   \n",
       "487  MATCH (u:User)-[r:INTERACTED]->(:User)\\nWHERE ...   \n",
       "486  MATCH (u:User)\\nWHERE u.color = '#3CB371' AND ...   \n",
       "485  MATCH (u:User)-[:INTERACTED]->(v:User {color: ...   \n",
       "484  MATCH (u:User {label: 'sarahp.bsky.social'})-[...   \n",
       "\n",
       "                                              question  \\\n",
       "483  Who are the top 5 users with the most interact...   \n",
       "487  List the top 5 users having the most interacti...   \n",
       "486  Which 3 users have the largest size and a colo...   \n",
       "485  Show the top 5 users who have interacted with ...   \n",
       "484  Find the top 3 users interacted with by 'sarah...   \n",
       "\n",
       "                          type database         id_y  \n",
       "483  Complex Retrieval Queries  bluesky  request-483  \n",
       "487  Complex Retrieval Queries  bluesky  request-487  \n",
       "486  Complex Retrieval Queries  bluesky  request-486  \n",
       "485  Complex Retrieval Queries  bluesky  request-485  \n",
       "484  Complex Retrieval Queries  bluesky  request-484  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combined_df = pd.merge(df, meta_df, left_on=\"custom_id\", right_on=\"id\")\n",
    "combined_df = combined_df.sort_values(by=\"database\")\n",
    "combined_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed1e154f-2d4f-451b-94c5-19840a45c26a",
   "metadata": {},
   "outputs": [],
   "source": [
    "syntax_error = []\n",
    "returns_results = []\n",
    "timeouts = []\n",
    "last_graph = \"\"\n",
    "for i, row in combined_df.reset_index().iterrows():\n",
    "    if i % 100 == 0:\n",
    "        print(i)\n",
    "    # To avoid a new driver for every request\n",
    "    if row[\"database\"] != last_graph:\n",
    "        last_graph = row[\"database\"]\n",
    "        print(last_graph)\n",
    "        graph = Neo4jGraph(\n",
    "            url=DEMO_URL,\n",
    "            username=row[\"database\"],\n",
    "            password=row[\"database\"],\n",
    "            database=row[\"database\"],\n",
    "            refresh_schema=False,\n",
    "            timeout=10,\n",
    "        )\n",
    "    try:\n",
    "        data = graph.query(row[\"cypher\"])\n",
    "        if data:\n",
    "            returns_results.append(True)\n",
    "        else:\n",
    "            returns_results.append(False)\n",
    "        syntax_error.append(False)\n",
    "        timeouts.append(False)\n",
    "    except ValueError as e:\n",
    "        if \"Generated Cypher Statement is not valid\" in str(e):\n",
    "            syntax_error.append(True)\n",
    "            print(f\"Syntax error in Cypher query: {e}\")\n",
    "        else:\n",
    "            syntax_error.append(False)\n",
    "            print(f\"Other ValueError: {e}\")\n",
    "        returns_results.append(False)\n",
    "        timeouts.append(False)\n",
    "    except Exception as e:\n",
    "        if (\n",
    "            e.code\n",
    "            == \"Neo.ClientError.Transaction.TransactionTimedOutClientConfiguration\"\n",
    "        ):\n",
    "            returns_results.append(False)\n",
    "            syntax_error.append(False)\n",
    "            timeouts.append(True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "3149b4f3-91d5-49ae-bd52-d0955ebde505",
   "metadata": {},
   "outputs": [],
   "source": [
    "combined_df[\"syntax_error\"] = syntax_error\n",
    "combined_df[\"timeout\"] = timeouts\n",
    "combined_df[\"returns_results\"] = returns_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "7fe4e8e6-34f4-4a1b-83ae-d375f9bd71f2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>cypher</th>\n",
       "      <th>type</th>\n",
       "      <th>database</th>\n",
       "      <th>syntax_error</th>\n",
       "      <th>timeout</th>\n",
       "      <th>returns_results</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>483</th>\n",
       "      <td>Who are the top 5 users with the most interact...</td>\n",
       "      <td>MATCH (u:User)-[r:INTERACTED]-&gt;(:User)\\nWHERE ...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>487</th>\n",
       "      <td>List the top 5 users having the most interacti...</td>\n",
       "      <td>MATCH (u:User)-[r:INTERACTED]-&gt;(:User)\\nWHERE ...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>486</th>\n",
       "      <td>Which 3 users have the largest size and a colo...</td>\n",
       "      <td>MATCH (u:User)\\nWHERE u.color = '#3CB371' AND ...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>485</th>\n",
       "      <td>Show the top 5 users who have interacted with ...</td>\n",
       "      <td>MATCH (u:User)-[:INTERACTED]-&gt;(v:User {color: ...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>484</th>\n",
       "      <td>Find the top 3 users interacted with by 'sarah...</td>\n",
       "      <td>MATCH (u:User {label: 'sarahp.bsky.social'})-[...</td>\n",
       "      <td>Complex Retrieval Queries</td>\n",
       "      <td>bluesky</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              question  \\\n",
       "483  Who are the top 5 users with the most interact...   \n",
       "487  List the top 5 users having the most interacti...   \n",
       "486  Which 3 users have the largest size and a colo...   \n",
       "485  Show the top 5 users who have interacted with ...   \n",
       "484  Find the top 3 users interacted with by 'sarah...   \n",
       "\n",
       "                                                cypher  \\\n",
       "483  MATCH (u:User)-[r:INTERACTED]->(:User)\\nWHERE ...   \n",
       "487  MATCH (u:User)-[r:INTERACTED]->(:User)\\nWHERE ...   \n",
       "486  MATCH (u:User)\\nWHERE u.color = '#3CB371' AND ...   \n",
       "485  MATCH (u:User)-[:INTERACTED]->(v:User {color: ...   \n",
       "484  MATCH (u:User {label: 'sarahp.bsky.social'})-[...   \n",
       "\n",
       "                          type database  syntax_error  timeout  \\\n",
       "483  Complex Retrieval Queries  bluesky         False    False   \n",
       "487  Complex Retrieval Queries  bluesky         False    False   \n",
       "486  Complex Retrieval Queries  bluesky         False    False   \n",
       "485  Complex Retrieval Queries  bluesky         False    False   \n",
       "484  Complex Retrieval Queries  bluesky         False    False   \n",
       "\n",
       "     returns_results  \n",
       "483             True  \n",
       "487            False  \n",
       "486             True  \n",
       "485             True  \n",
       "484             True  "
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_df = combined_df[\n",
    "    [\n",
    "        \"question\",\n",
    "        \"cypher\",\n",
    "        \"type\",\n",
    "        \"database\",\n",
    "        \"syntax_error\",\n",
    "        \"timeout\",\n",
    "        \"returns_results\",\n",
    "    ]\n",
    "]\n",
    "final_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "710660aa-7ffb-490d-87fd-423ba9f5e840",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Distribution for syntax_error:\n",
      " syntax_error\n",
      "False    9537\n",
      "True      287\n",
      "Name: count, dtype: int64\n",
      "Distribution for timeout:\n",
      " timeout\n",
      "False    9664\n",
      "True      160\n",
      "Name: count, dtype: int64\n",
      "Distribution for returns_results:\n",
      " returns_results\n",
      "True     7685\n",
      "False    2139\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Assume df is your DataFrame and col1, col2, col3 are the boolean columns\n",
    "distribution_col1 = final_df[\"syntax_error\"].value_counts()\n",
    "distribution_col2 = final_df[\"timeout\"].value_counts()\n",
    "distribution_col3 = final_df[\"returns_results\"].value_counts()\n",
    "\n",
    "print(\"Distribution for syntax_error:\\n\", distribution_col1)\n",
    "print(\"Distribution for timeout:\\n\", distribution_col2)\n",
    "print(\"Distribution for returns_results:\\n\", distribution_col3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "88b10189-c8a2-415f-9a15-3471fe82e714",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_df.to_csv(\"text2cypher_gpt4turbo.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14d73460-6a08-4e93-bc4a-58bb8c920693",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
